underscore_to_space <- function(x) str_replace_all(x, "_", " ")
outliers <- read_tsv("../input_data/druggable_outliers_from_treehouse_and_other_cohorts_2023_11_09-13_46_32_2023.tsv")
## Rows: 287 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sample_ID, comparison_cohort, gene, donor_ID
## lgl (1): pathway_support
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Define cohort codes

cohort_codes <- tibble(
  cohort_name = 
    c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc", "Treehouse_pd"),
  cohort_code = 
    c("P", "T", "S", "C", "D"))

Tile plot of all outliers

ggplot(outliers) +
  geom_tile(aes(x=comparison_cohort,
                y=gene, 
                fill = comparison_cohort)) +
  facet_wrap(~Sample_ID,
             nrow = 1) +
  theme(#axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.text.x = element_blank(),
    strip.text.x = element_text(angle = 90),
        ) +
  xlab("")  +
  scale_fill_bright()

Heatmap shows number of cohorts in which outlier were detected

I can make this look better if we decide to use it, but it’s non-trivial

outliers_heatmap_data <- outliers %>%
  group_by(Sample_ID, gene) %>%
  summarize(n_outliers = n()) 
## `summarise()` has grouped output by 'Sample_ID'. You can override using the
## `.groups` argument.
ggplot(outliers_heatmap_data) +
  geom_tile(aes(x=Sample_ID,
                y=gene,
                fill = n_outliers), 
            color = "black")  +
  #theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 

library(ggVennDiagram)
raw_outliers_for_venn <- outliers %>%
  mutate(sample_gene = paste(Sample_ID, gene, sep = "_")) %>%
  arrange(comparison_cohort) %>%
  select(sample_gene, comparison_cohort) %>%
  group_split(comparison_cohort)


list_of_outliers_for_venn <-  lapply(raw_outliers_for_venn, function(x) x %>% pull(sample_gene))
names(list_of_outliers_for_venn) <- unique(outliers$comparison_cohort) %>% sort

ggVennDiagram(list_of_outliers_for_venn,
              show_intersect = TRUE)
## Warning in geom_text(aes_string(label = "count", text = "text"), x =
## label_coord[, : Ignoring unknown aesthetics: text
ggVennDiagram(list_of_outliers_for_venn) + 
  scale_fill_distiller(palette = "Reds", direction = 1)

Annotate with combined full cohort names

collapse_fun <- function(x){ paste(x,collapse = ", ") }

all_outliers_combined_wide <- outliers %>%
  select(-pathway_support, -donor_ID) %>%
  pivot_wider(names_from = Sample_ID,
              values_from = comparison_cohort,
              values_fn = collapse_fun)

n_distinct(outliers$Sample_ID)
## [1] 34
n_distinct(outliers$gene)
## [1] 56
all_outliers_combined_long <- all_outliers_combined_wide %>%
  pivot_longer(-gene,
               names_to = "Sample_ID",
               values_to = "comparison_cohorts") %>%
  na.omit()

How many outliers are present in each combination of cohorts?

tabyl(all_outliers_combined_long,
      comparison_cohorts) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting() %>%
  adorn_totals() %>%
  kbl() %>%
  kable_styling(full_width = F)
comparison_cohorts n percent
TCGA, Treehouse_pc 27 20.8%
TCGA 21 16.2%
Treehouse_pd 13 10.0%
TCGA, TH03_TH34, Treehouse_pc 12 9.2%
TH03_TH34 11 8.5%
PEDAYA, TCGA, TH03_TH34, Treehouse_pc 9 6.9%
TCGA, TH03_TH34, Treehouse_pc, Treehouse_pd 8 6.2%
PEDAYA, TCGA, TH03_TH34, Treehouse_pc, Treehouse_pd 7 5.4%
PEDAYA 5 3.8%
TCGA, Treehouse_pc, Treehouse_pd 4 3.1%
TCGA, TH03_TH34 3 2.3%
TCGA, Treehouse_pd 3 2.3%
PEDAYA, TCGA, Treehouse_pc, Treehouse_pd 2 1.5%
PEDAYA, TCGA, TH03_TH34 1 0.8%
PEDAYA, TCGA, Treehouse_pc 1 0.8%
PEDAYA, Treehouse_pc 1 0.8%
TH03_TH34, Treehouse_pc 1 0.8%
TH03_TH34, Treehouse_pd 1 0.8%
Total 130
ggplot(all_outliers_combined_long) +
  geom_tile(aes(x=Sample_ID,
                y=gene,
                fill = comparison_cohorts))  +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

n_distinct(all_outliers_combined_long$Sample_ID)                
## [1] 34

Annotate with minimal combined cohort abbreviations

collapse_fun_no_coma <- function(x){ paste(x,collapse = "") }

# backslashes prevent asterisks from being interpreted as italics in the kbl table

all_outliers_min_abbrev_combined_wide <- outliers %>%
  left_join(cohort_codes,
            by=c("comparison_cohort"="cohort_name")) %>%
  mutate(cohort_code_pathway = ifelse(pathway_support,
                                      paste0(cohort_code, "\\*"),
                                      cohort_code)) %>%
  select(-pathway_support, -donor_ID,
         -comparison_cohort,
         -cohort_code) %>%
  pivot_wider(names_from = Sample_ID,
              values_from = cohort_code_pathway,
              values_fn = collapse_fun_no_coma,
              values_fill = "")


all_outliers_min_abbrev_combined_wide %>%
  arrange(gene) %>%
  rename_all(underscore_to_space) %>%
  kbl() %>%
  kable_styling(full_width = F,
                bootstrap_options = "bordered")
gene TH34 1162 S01 TH34 1149 S02 TH34 1238 S01 TH34 1349 S01 TH34 1349 S02 TH34 1379 S01 TH34 1380 S01 TH34 1150 S02 TH34 1399 S01 TH34 1400 S01 TH34 1412 S01 TH34 1414 S01 TH34 1415 S01 TH34 1444 S01 TH34 1452 S01 TH34 2292 S01 TH34 2351 S01 TH34 2411 S01 TH34 2666 S01 TH34 1163 S01 TH34 1179 S01 TH34 1239 S01 TH34 1350 S01 TH34 1351 S01 TH34 1352 S01 TH34 1381 S01 TH34 1445 S02 TH34 1446 S01 TH34 1447 S01 TH34 1447 S02 TH34 1455 S01 TH34 1456 S02 TH34 2293 S01 TH34 2410 S01
AKT1 S*D* S*
AKT2 P*T*S*C*D*
ALK PTSC
BCL6 D T*
BTK T*S*C* T*C* T*C*
CCND1 T*S*
CCND2 D* D
CCND3 T* T*
CCNE1 D*
CDK4 T*S*C* P*T*S*C*D*
CDK9 T* D T*SC
CSF1R D*
DEPTOR S*
ETV1 TC* T*C* T T
FGFR1 T* T*C* T*C*
FGFR2 T*
FGFR3 PT*S* T
FGFR4 P P P* PC P
FLT4 TSCD P*T*S*C* TC
GATA2 T T*CD*
HDAC4 PT*S*CD*
HDAC7 D*
HMOX1 PTCD* T*C*D* P*T*C*D T*C* D* D* T* T*C
HSP90B1 T*S*C*D*
IGF1 P*T*S*C*D P*T*C* P*
IGF2 T*C* T* T*D* TC* T*C* T*C* T*C T*C* T*C* TC T*C* T* T*D* T* TC TC T*C* T*
IL6 P*T*S*C*
JAK1 TSC PTSC
KDR T*S*C* T*
KIT PTSC PTSC P*T*S*C*D*
MAP2K2 S* T*S*C*D*
MAP2K4 TSCD*
MDM2 P*T*S*C*D* PTSC
MS4A1 PT*S*C*
MTOR T*S*
NOTCH3 T*S*C*D*
NTRK2 S* S* S* S* S* S* SC
NTRK3 TCD T*C* TS*C TC
PARP1 D
PARP2 T*C*D* T*
PDCD1 PTSC
PDGFRA T*
PIK3CD T*S*C* T*S*C* T*
PIK3R1 S*
PIK3R2 D* T*S*C*
PIK3R5 T*C* T*C*
PTCH1 T* T*C* T*C*
RAF1 D*
RPTOR T*S*
STAT1 S
STAT2 TSCD
STAT5A D*
TSC2 T*S*C*D* T*S*C* T*
VEGFA T*S*C* T*S*C* T*D*
VEGFC P*T*S*C*D*
WEE1 T*SC*D*

Annotate with combined cohort abbreviations

all_outliers_abbrev_combined_wide <- outliers %>%
  left_join(cohort_codes,
            by=c("comparison_cohort"="cohort_name")) %>%
  select(-pathway_support, -donor_ID,
         -comparison_cohort) %>%
  pivot_wider(names_from = Sample_ID,
              values_from = cohort_code,
              values_fn = collapse_fun,
              values_fill = "")

#Out

all_outliers_abbrev_combined_wide %>%
  arrange(gene) %>%
  rename_all(underscore_to_space) %>%
  kbl() %>%
  kable_styling(full_width = F,
                bootstrap_options = "bordered")
gene TH34 1162 S01 TH34 1149 S02 TH34 1238 S01 TH34 1349 S01 TH34 1349 S02 TH34 1379 S01 TH34 1380 S01 TH34 1150 S02 TH34 1399 S01 TH34 1400 S01 TH34 1412 S01 TH34 1414 S01 TH34 1415 S01 TH34 1444 S01 TH34 1452 S01 TH34 2292 S01 TH34 2351 S01 TH34 2411 S01 TH34 2666 S01 TH34 1163 S01 TH34 1179 S01 TH34 1239 S01 TH34 1350 S01 TH34 1351 S01 TH34 1352 S01 TH34 1381 S01 TH34 1445 S02 TH34 1446 S01 TH34 1447 S01 TH34 1447 S02 TH34 1455 S01 TH34 1456 S02 TH34 2293 S01 TH34 2410 S01
AKT1 S, D S
AKT2 P, T, S, C, D
ALK P, T, S, C
BCL6 D T
BTK T, S, C T, C T, C
CCND1 T, S
CCND2 D D
CCND3 T T
CCNE1 D
CDK4 T, S, C P, T, S, C, D
CDK9 T D T, S, C
CSF1R D
DEPTOR S
ETV1 T, C T, C T T
FGFR1 T T, C T, C
FGFR2 T
FGFR3 P, T, S T
FGFR4 P P P P, C P
FLT4 T, S, C, D P, T, S, C T, C
GATA2 T T, C, D
HDAC4 P, T, S, C, D
HDAC7 D
HMOX1 P, T, C, D T, C, D P, T, C, D T, C D D T T, C
HSP90B1 T, S, C, D
IGF1 P, T, S, C, D P, T, C P
IGF2 T, C T T, D T, C T, C T, C T, C T, C T, C T, C T, C T T, D T T, C T, C T, C T
IL6 P, T, S, C
JAK1 T, S, C P, T, S, C
KDR T, S, C T
KIT P, T, S, C P, T, S, C P, T, S, C, D
MAP2K2 S T, S, C, D
MAP2K4 T, S, C, D
MDM2 P, T, S, C, D P, T, S, C
MS4A1 P, T, S, C
MTOR T, S
NOTCH3 T, S, C, D
NTRK2 S S S S S S S, C
NTRK3 T, C, D T, C T, S, C T, C
PARP1 D
PARP2 T, C, D T
PDCD1 P, T, S, C
PDGFRA T
PIK3CD T, S, C T, S, C T
PIK3R1 S
PIK3R2 D T, S, C
PIK3R5 T, C T, C
PTCH1 T T, C T, C
RAF1 D
RPTOR T, S
STAT1 S
STAT2 T, S, C, D
STAT5A D
TSC2 T, S, C, D T, S, C T
VEGFA T, S, C T, S, C T, D
VEGFC P, T, S, C, D
WEE1 T, S, C, D

Summary table for all outliers

n_outliers_detected_by_any_method <- outliers %>%
  select(Sample_ID, gene) %>%
  distinct %>%
  nrow()

n_outliers_with_pathway_support_detected_by_any_method <- outliers %>%
  filter(pathway_support) %>%
  select(Sample_ID, gene) %>%
  distinct %>%
  nrow()
# these have pathway support in at least one cohort


outlier_summary <- outliers %>% 
  group_by(comparison_cohort) %>%
  summarize(n_outliers_detected = n(),
         n_outliers_with_pathway_support = sum(pathway_support),
         pct_outliers_with_pathway_support = 100*n_outliers_with_pathway_support/n_outliers_detected,
         pct_outliers_detected = 100*n_outliers_detected/n_outliers_detected_by_any_method)

outlier_summary_with_totals <- 
bind_rows(outlier_summary,
          tibble(comparison_cohort= " Total",
                 n_outliers_detected = n_outliers_detected_by_any_method,
                 n_outliers_with_pathway_support = n_outliers_with_pathway_support_detected_by_any_method,
                 pct_outliers_with_pathway_support = 100*n_outliers_with_pathway_support_detected_by_any_method/n_outliers_detected_by_any_method))
                 
                 
  
outlier_summary_with_totals %>% 
  rename_all(underscore_to_space) %>%
  kbl(format.args = list(big.mark = ","), digits = c(NA, 0, 0, 0, 0)) %>%
  kable_styling(full_width = F)
comparison cohort n outliers detected n outliers with pathway support pct outliers with pathway support pct outliers detected
PEDAYA 26 12 46 20
TCGA 98 74 76 75
TH03_TH34 53 39 74 41
Treehouse_pc 72 47 65 55
Treehouse_pd 38 29 76 29
Total 130 101 78 NA

REPEAT ANALYSIS USING ONLY OUTLIERS WITH PATHWAY SUPPORT

Tile plot of outliers with pathway support

ggplot(outliers %>%
         filter(pathway_support)) +
  geom_tile(aes(x=comparison_cohort,
                y=gene, 
                fill = comparison_cohort)) +
  facet_wrap(~Sample_ID,
             nrow = 1) +
  theme(#axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.text.x = element_blank(),
    strip.text.x = element_text(angle = 90),
        ) +
  xlab("")  +
  scale_fill_bright()

Heatmap shows number of cohorts in which outlier were detected

I can make this look better if we decide to use it, but it’s non-trivial

pathway_outliers_heatmap_data <- outliers %>%
  filter(pathway_support) %>%
  group_by(Sample_ID, gene) %>%
  summarize(n_outliers = n()) 
## `summarise()` has grouped output by 'Sample_ID'. You can override using the
## `.groups` argument.
ggplot(pathway_outliers_heatmap_data) +
  geom_tile(aes(x=Sample_ID,
                y=gene,
                fill = n_outliers), 
            color = "black")  +
  #theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) 

raw_pathway_support_outliers_for_venn <- outliers %>%
  filter(pathway_support)  %>%
  mutate(sample_gene = paste(Sample_ID, gene, sep = "_")) %>%
  arrange(comparison_cohort) %>%
  select(sample_gene, comparison_cohort) %>%
  group_split(comparison_cohort)


list_of_pathway_support_outliers_for_venn <-  lapply(raw_pathway_support_outliers_for_venn, function(x) x %>% pull(sample_gene))
names(list_of_pathway_support_outliers_for_venn) <- outliers %>%
  filter(pathway_support) %>%
  arrange(comparison_cohort) %>%
  select(comparison_cohort) %>%
  distinct() %>%
  pull(comparison_cohort)

ggVennDiagram(list_of_pathway_support_outliers_for_venn,
              show_intersect = TRUE)
## Warning in geom_text(aes_string(label = "count", text = "text"), x =
## label_coord[, : Ignoring unknown aesthetics: text
ggVennDiagram(list_of_pathway_support_outliers_for_venn) + 
  scale_fill_distiller(palette = "Reds", direction = 1)

Annotate with combined full cohort names

outliers_with_pathway_support_combined_wide <- outliers %>%
  filter(pathway_support) %>%
  select(-pathway_support, -donor_ID) %>%
  pivot_wider(names_from = Sample_ID,
              values_from = comparison_cohort,
              values_fn = collapse_fun)

outliers_with_pathway_support_combined_long <- outliers_with_pathway_support_combined_wide %>%
  pivot_longer(-gene,
               names_to = "Sample_ID",
               values_to = "comparison_cohorts") %>%
  na.omit()

How many outliers with pathway support are present in each combination of cohorts?

tabyl(outliers_with_pathway_support_combined_long,
      comparison_cohorts) %>%
  arrange(desc(n)) %>%
  adorn_pct_formatting() %>%
  adorn_totals() %>%
  kbl() %>%
  kable_styling(full_width = F)
comparison_cohorts n percent
TCGA 20 19.8%
TCGA, Treehouse_pc 18 17.8%
TH03_TH34 11 10.9%
Treehouse_pd 11 10.9%
TCGA, TH03_TH34, Treehouse_pc 10 9.9%
PEDAYA, TCGA, TH03_TH34, Treehouse_pc, Treehouse_pd 5 5.0%
TCGA, TH03_TH34 4 4.0%
TCGA, TH03_TH34, Treehouse_pc, Treehouse_pd 4 4.0%
TCGA, Treehouse_pd 4 4.0%
PEDAYA, TCGA, TH03_TH34, Treehouse_pc 3 3.0%
TCGA, Treehouse_pc, Treehouse_pd 3 3.0%
PEDAYA 2 2.0%
PEDAYA, TCGA, Treehouse_pc 2 2.0%
Treehouse_pc 2 2.0%
TCGA, TH03_TH34, Treehouse_pd 1 1.0%
TH03_TH34, Treehouse_pd 1 1.0%
Total 101
ggplot(outliers_with_pathway_support_combined_long) +
  geom_tile(aes(x=Sample_ID,
                y=gene,
                fill = comparison_cohorts))  +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

n_distinct(outliers_with_pathway_support_combined_long$Sample_ID)                
## [1] 32

Annotate with combined cohort abbreviations

outliers_with_pathway_support_abbrev_combined_wide <- outliers %>%
  filter(pathway_support) %>%
  left_join(cohort_codes,
            by=c("comparison_cohort"="cohort_name")) %>%
  select(-pathway_support, -donor_ID,
         -comparison_cohort) %>%
  pivot_wider(names_from = Sample_ID,
              values_from = cohort_code,
              values_fn = collapse_fun,
              values_fill = "")

Big table of outliers with pathway support

outliers_with_pathway_support_abbrev_combined_wide %>%
  arrange(gene) %>%
  rename_all(underscore_to_space) %>%
  kbl() %>%
  kable_styling(full_width = F,
                bootstrap_options = "bordered")
gene TH34 1149 S02 TH34 1238 S01 TH34 1399 S01 TH34 1400 S01 TH34 1412 S01 TH34 1415 S01 TH34 1444 S01 TH34 1452 S01 TH34 2292 S01 TH34 2411 S01 TH34 1179 S01 TH34 1239 S01 TH34 1349 S01 TH34 1349 S02 TH34 1350 S01 TH34 1352 S01 TH34 1379 S01 TH34 1380 S01 TH34 1381 S01 TH34 1150 S02 TH34 1414 S01 TH34 1445 S02 TH34 1447 S01 TH34 1447 S02 TH34 1455 S01 TH34 1456 S02 TH34 2293 S01 TH34 2351 S01 TH34 2410 S01 TH34 1446 S01 TH34 1162 S01 TH34 1351 S01
AKT1 S, D S
AKT2 P, T, S, C, D
BCL6 T
BTK T, S, C T, C T, C
CCND1 T, S
CCND2 D
CCND3 T T
CCNE1 D
CDK4 P, T, S, C, D T, S, C
CDK9 T T
CSF1R D
DEPTOR S
ETV1 C T, C
FGFR1 T T, C T, C
FGFR2 T
FGFR3 T, S
FGFR4 P
FLT4 P, T, S, C
GATA2 T, D
HDAC4 T, S, D
HDAC7 D
HMOX1 P, T, C T, C T, C, D D T T D D
HSP90B1 T, S, C, D
IGF1 P, T, S, C P, T, C P
IGF2 T, C T, C T, C T, C T T, C T, D C T T T, D T T, C T, C T
IL6 P, T, S, C
KDR T, S, C T
KIT P, T, S, C, D
MAP2K2 T, S, C, D S
MAP2K4 D
MDM2 P, T, S, C, D
MS4A1 T, S, C
MTOR T, S
NOTCH3 T, S, C, D
NTRK2 S S S S S S
NTRK3 T, C S
PARP2 T, C, D T
PDGFRA T
PIK3CD T, S, C T, S, C T
PIK3R1 S
PIK3R2 D T, S, C
PIK3R5 T, C T, C
PTCH1 T, C T T, C
RAF1 D
RPTOR T, S
STAT5A D
TSC2 T, S, C, D T, S, C T
VEGFA T, S, C T, S, C T, D
VEGFC P, T, S, C, D
WEE1 T, C, D